exercise-05

Author

John Hinkle & Yen Do

Exercise 05, Challenge 1

Step 1: Load packages and data

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
d <- read.csv("https://raw.githubusercontent.com/difiore/ada-datasets/main/IMDB-movies.csv")

Step 2: Filter data set

df <- filter(d, runtimeMinutes >= 60, runtimeMinutes <= 180 & startYear >= 1920, startYear <= 1979) |> mutate(decade = paste0(floor(((startYear-1900)/10))*10,"s"))
nrow(df)
[1] 5651

Step 3: Generate histograms of movie run time for each decade

library(ggplot2)
ggplot(df, aes(x=runtimeMinutes)) +
  geom_histogram(binwidth=15) +
  facet_wrap(~decade)

Step 4: Calculate mean runtime and standard deviation for each decade

print(results <- group_by(df, decade) |>
  summarise(runtime_mean = mean(runtimeMinutes), runtime_sd = sd(runtimeMinutes)))
# A tibble: 6 × 3
  decade runtime_mean runtime_sd
  <chr>         <dbl>      <dbl>
1 20s            96.3       26.2
2 30s            90.3       17.3
3 40s            97.2       19.1
4 50s            98.9       19.2
5 60s           106.        21.2
6 70s           104.        18.0

Step 5: Draw a single sample of 100 movies per decade

sample <- df |>
  group_by(decade) |>
  slice_sample(n=100) |>
  summarise(sample_mean = mean(runtimeMinutes), 
            sample_sd = sd(runtimeMinutes)
            )

Step 6: Calculate standard error

sample <- sample |>
  mutate(sample_se = sample_sd / sqrt(100))

Step 7: Compare estimates to actual population values

comparison <- left_join(sample, results, by = "decade") |>
  mutate(
    runtime_se = runtime_sd / sqrt(100),
    diff_mean = abs(sample_mean - runtime_mean),
    diff_se = abs(sample_se - runtime_se)
  )

Step 8: Generate a sampling distribution

library(infer)
sampling_dist <- df |>
  group_by(decade) |>
  rep_sample_n(size = 100, reps = 1000, replace = FALSE) |>
  group_by(decade, replicate) |>
  summarise(samp_dist_mean = mean(runtimeMinutes),
            samp_dist_sd = sd(runtimeMinutes),
            .groups = "drop")

Step 9: Calculate the mean and the standard deviation of the sampling distribution of sample means for each decade

sampling_stats <- sampling_dist |>
  group_by(decade) |>
  summarise(samp_dis_mean = mean(samp_dist_mean),
            sam_dis_sd = sd(samp_dist_mean)
  )

ggplot(sampling_dist, aes(x=samp_dist_mean)) +
  geom_histogram(binwidth=1) +
  facet_wrap(~decade)

The shape of plot: bell shape

Step 10: Compare standard errors

sampling_dist_2 <- sampling_dist |>
  group_by(decade) |>
  summarise(se_2 = sd(samp_dist_mean))

comparison2 <- comparison |>
  left_join(sampling_dist_2, by = "decade")

Exercise 05, Challenge 2

Step 1: Load data

library(tidyverse)
z <- read.csv("https://raw.githubusercontent.com/difiore/ada-datasets/main/zombies.csv")

Step 2: Calculate the population mean and standard deviation for each quantitative random variable

pop_sd <- function(x){
  sqrt(sum((x - mean(x))^2) / length(x))
}



print(z_summary <- 
        z |> 
        select(-id) |>
  summarize(across(where(is.numeric), list( 
    mean = ~mean(.), 
    sd = ~pop_sd(.)
    ), .names = "{.col}_{.fn}")))
  height_mean height_sd weight_mean weight_sd zombies_killed_mean
1     67.6301   4.30797    143.9075  18.39186               2.992
  zombies_killed_sd years_of_education_mean years_of_education_sd age_mean
1          1.747551                   2.996              1.675704 20.04696
    age_sd
1 2.963583

Step 3: Create box plots of quantitative random variables by gender

library(ggplot2)

# Height
height_boxplot <- ggplot(z, aes(x=gender, y=height)) +
  geom_boxplot(color = "#BF5700")
height_boxplot

# Weight
weight_boxplot <- ggplot(z, aes(x=gender, y=weight)) +
  geom_boxplot(color = "#BF5700")
weight_boxplot

# Zombies killed
kills_boxplot <- ggplot(z, aes(x=gender, y=zombies_killed)) +
  geom_boxplot(color = "#BF5700")
kills_boxplot

# Education
education_boxplot <- ggplot(z, aes(x=gender, y=years_of_education)) +
  geom_boxplot(color = "#BF5700")
education_boxplot

# Age
age_boxplot <- ggplot(z, aes(x=gender, y=age)) +
  geom_boxplot(color = "#BF5700")
age_boxplot

Step 4: Scatter plots of height and weight vs. age

It appears that there is a positive relationship between height vs. age, and weight vs. age. As you get older, you get taller and heavier. It is worth noting that the age axis only goes from 10 years old to 30 years old.

# Height
print(height_scatter <- ggplot(z, aes(x= age, y=height, color=gender)) +
  geom_point())

# Weight
print(weight_scatter <- ggplot(z, aes(x= age, y=weight, color=gender)) +
  geom_point())

Step 5a: Histograms of each random quantitative variable

‘zombies_killed’ and ‘years_of_education’ both appear to have non-normal distributions.

# Height
hist(z$height)

# Weight
hist(z$weight)

# Zombies killed
hist(z$zombies_killed)

# Years of education
hist(z$years_of_education)

# Age
hist(z$age)

Step 5b: Q-Q plots of each random quantitative variable

As with the histograms, ‘zombies_killed’ and ‘years_of_education’ both appear to have non-normal distributions.

# Height
qqnorm(z$height, main="Height")
qqline(z$height, col = "#bf5700")

# Weight
qqnorm(z$weight, main="Weight")
qqline(z$weight, col = "#bf5700")

# Zombies killed
qqnorm(z$zombies_killed, main="Zombies Killed")
qqline(z$zombies_killed, col = "#bf5700")

# Years of education
qqnorm(z$years_of_education, main="Years of Education")
qqline(z$years_of_education, col = "#bf5700")

# Age
qqnorm(z$age, main="Age")
qqline(z$age, col = "#bf5700")

Step 6: Sample one subset of 50 survivors, calculate summary statistics and CIs for each quantitative random variable

z_sample <- slice_sample(z, n=50)

# summary stats df
print(z_sample_summary <- 
        z_sample |> 
        select(-id) |>
  summarize(across(where(is.numeric), list( 
    mean = ~mean(.), 
    sd = ~sd(.),
    se = ~sd(.)/sqrt(length(.))
    ), .names = "{.col}_{.fn}")))
  height_mean height_sd height_se weight_mean weight_sd weight_se
1     66.7529  4.127384 0.5837002    140.3271  16.17735  2.287823
  zombies_killed_mean zombies_killed_sd zombies_killed_se
1                3.22          1.787599         0.2528047
  years_of_education_mean years_of_education_sd years_of_education_se age_mean
1                    3.12              1.858681             0.2628571 19.61069
    age_sd    age_se
1 2.705407 0.3826023
# 95% CI for height
height_ci <- z_sample_summary$height_mean + qnorm(p=c(0.025, 0.975))*z_sample_summary$height_se
cat("95% CI for height:", height_ci, "\n")
95% CI for height: 65.60887 67.89694 
# 95% CI for weight
weight_ci <- z_sample_summary$weight_mean + qnorm(p=c(0.025, 0.975))*z_sample_summary$weight_se
cat("95% CI for weight:", weight_ci, "\n")
95% CI for weight: 135.843 144.8111 
# 95% CI for zombies killed
kills_ci <- z_sample_summary$zombies_killed_mean + qnorm(p=c(0.025, 0.975))*z_sample_summary$zombies_killed_se
cat("95% CI for zombies killed:", kills_ci, "\n")
95% CI for zombies killed: 2.724512 3.715488 
# 95% CI for years of education
education_ci <- z_sample_summary$years_of_education_mean + qnorm(p=c(0.025, 0.975))*z_sample_summary$years_of_education_se
cat("95% CI for years of education:", education_ci, "\n")
95% CI for years of education: 2.604809 3.635191 
# # 95% CI for age
age_ci <- z_sample_summary$age_mean + qnorm(p=c(0.025, 0.975))*z_sample_summary$age_se
cat("95% CI for age:", age_ci)
95% CI for age: 18.8608 20.36058

Step 7: Create a sampling distribution and calculate summary statistics for it.

The means and standard deviations of the sampling distribution are similar to that of the single sample.

reps <- 199
zquant <- z[, sapply(z, is.numeric)] |> 
  select(-id)

sample_means <- matrix(NA, nrow = reps, ncol = ncol(zquant))

for (i in 1:reps) {
  sample_data <- zquant[sample(1:nrow(zquant), 50), ]
  sample_means[i, ] <- colMeans(sample_data)
}

sample_means <- as.data.frame(sample_means)
colnames(sample_means) <- c("height_mean", "weight_mean", "zombies_killed_mean", "years_of_education_mean", "age_mean")

# Create a df containing the means from the initial, single sample
single_sample <-data.frame(
  height_mean = z_sample_summary$height_mean,
  weight_mean = z_sample_summary$weight_mean,
  zombies_killed_mean = z_sample_summary$zombies_killed_mean,
  years_of_education_mean = z_sample_summary$years_of_education_mean,
  age_mean = z_sample_summary$age_mean
)

# Add that initial sample to the other 199
sample_means <- rbind(sample_means, single_sample)

# Get the mean and standard deviation for each variable
print(sampling_distribution_summary_stats <- data.frame(
  mean = colMeans(sample_means),
  sd = apply(sample_means, 2, sd)
))
                             mean        sd
height_mean              67.57518 0.6187562
weight_mean             143.73622 2.6616763
zombies_killed_mean       3.01030 0.2245619
years_of_education_mean   3.00300 0.2413259
age_mean                 20.06118 0.4060139

Step 8a: Histograms for the sampling distribution

Based on the histograms, all of the variables appear to have roughly normally distributed sampling distributions. This includes the ‘zombies_killed’ and ‘years_of_education’ variables which previosuly did not have normal distributions.

# Height mean
hist(sample_means$height_mean)

# Weight mean
hist(sample_means$weight_mean)

# Zombies killed mean
hist(sample_means$zombies_killed_mean)

# Years of education mean
hist(sample_means$years_of_education_mean)

# Age mean
hist(sample_means$age_mean)

Step 8b: Q-Q plots for the sampling distribution

As with the histograms, all of the variables appear to have roughly normally distributed sampling distributions. This includes the ‘zombies_killed’ and ‘years_of_education’ variables which previously did not have normal distributions.

# Height
qqnorm(sample_means$height_mean, main="Height mean")
qqline(sample_means$height_mean, col = "#bf5700")

# Weight
qqnorm(sample_means$weight_mean, main="Weight mean")
qqline(sample_means$weight_mean, col = "#bf5700")

# Zombies killed
qqnorm(sample_means$zombies_killed_mean, main="Zombies Killed mean")
qqline(sample_means$zombies_killed_mean, col = "#bf5700")

# Years of education
qqnorm(sample_means$years_of_education_mean, main="Years of Education mean")
qqline(sample_means$years_of_education_mean, col = "#bf5700")

# Age
qqnorm(sample_means$age_mean, main="Age mean")
qqline(sample_means$age_mean, col = "#bf5700")

Step 9: Confidence Intervals of Sampling Distribution

The confidence intervals generated here are “narrower” than the confidence intervals generated previously (ex. in Step 7) , with the lower (2.5%) bound closer to the upper(97.5%) bound than in Step 9.

# Height
print('Height means CI')
[1] "Height means CI"
quantile(sample_means$height_mean, c(0.025, 0.975))
    2.5%    97.5% 
66.53361 68.73822 
# Weight
print('Weight means CI')
[1] "Weight means CI"
quantile(sample_means$weight_mean, c(0.025, 0.975))
    2.5%    97.5% 
138.9718 148.5376 
# Zombies killed
print('Zombies killed means CI')
[1] "Zombies killed means CI"
quantile(sample_means$zombies_killed_mean, c(0.025, 0.975))
  2.5%  97.5% 
2.6000 3.4605 
# Years of education
print('Years of education means CI')
[1] "Years of education means CI"
quantile(sample_means$years_of_education_mean, c(0.025, 0.975))
 2.5% 97.5% 
 2.56  3.48 
# Age
print('Age means CI')
[1] "Age means CI"
quantile(sample_means$age_mean, probs = c(0.025, 0.975))
    2.5%    97.5% 
19.28069 20.93502 

Step 10: Bootstrapping to generate a 95% CI

The confidence intervals generated by bootstrapping are “narrower” than the confidence intervals generated in Step 9, with the lower (2.5%) bound closer to the upper(97.5%) bound than in Step 9.

n_boot=1000
n <- length(sample_means$height_mean)


boot <- vector()
for (i in 1:n_boot){
  boot[[i]] <- mean(sample(sample_means$height_mean, n, replace = TRUE))
}
quantile(probs = c(0.025, 0.975),boot)
    2.5%    97.5% 
67.49173 67.65546 
boot <- vector()
for (i in 1:n_boot){
  boot[[i]] <- mean(sample(sample_means$weight_mean, n, replace = TRUE))
}
quantile(probs = c(0.025, 0.975),boot)
    2.5%    97.5% 
143.3647 144.1096 
boot <- vector()
for (i in 1:n_boot){
  boot[[i]] <- mean(sample(sample_means$zombies_killed_mean, n, replace = TRUE))
}
quantile(probs = c(0.025, 0.975),boot)
    2.5%    97.5% 
2.979398 3.038205 
boot <- vector()
for (i in 1:n_boot){
  boot[[i]] <- mean(sample(sample_means$years_of_education_mean, n, replace = TRUE))
}
quantile(probs = c(0.025, 0.975),boot)
    2.5%    97.5% 
2.967690 3.035708 
n_boot <- 1000
boot <- vector()
for (i in 1:n_boot){
  boot[[i]] <- mean(sample(sample_means$age_mean, n, replace = TRUE))
}
quantile(probs = c(0.025, 0.975),boot)
    2.5%    97.5% 
20.00743 20.12254